II. Code to edit and execute using the Code-along.Rmd file


A. Data Wrangling


1. Loading packages (Slide #16)

# Attach the tidyverse so its core packages are available below
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

2. Loading data-set (Slide #16)

# Read hotels.csv from the working directory and store the result in `hotels`.
# NOTE(review): the glimpse() output further down shows `agent` and `company`
# holding the literal string "NULL"; if those should count as missing, pass
# na = c("", "NA", "NULL") here -- confirm before changing, since it alters
# the column types.
hotels <- read_csv("hotels.csv")
## Rows: 119390 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (13): hotel, arrival_date_month, meal, country, market_segment, distrib...
## dbl  (18): is_canceled, lead_time, arrival_date_year, arrival_date_week_numb...
## date  (1): reservation_status_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

3. List names of the variables in the data-set (Slide #19)

# Return the column (variable) names of hotels as a character vector
names(hotels)
##  [1] "hotel"                          "is_canceled"                   
##  [3] "lead_time"                      "arrival_date_year"             
##  [5] "arrival_date_month"             "arrival_date_week_number"      
##  [7] "arrival_date_day_of_month"      "stays_in_weekend_nights"       
##  [9] "stays_in_week_nights"           "adults"                        
## [11] "children"                       "babies"                        
## [13] "meal"                           "country"                       
## [15] "market_segment"                 "distribution_channel"          
## [17] "is_repeated_guest"              "previous_cancellations"        
## [19] "previous_bookings_not_canceled" "reserved_room_type"            
## [21] "assigned_room_type"             "booking_changes"               
## [23] "deposit_type"                   "agent"                         
## [25] "company"                        "days_in_waiting_list"          
## [27] "customer_type"                  "adr"                           
## [29] "required_car_parking_spaces"    "total_of_special_requests"     
## [31] "reservation_status"             "reservation_status_date"

4. Glimpse of contents of the data-set (Slide #20)

# Compact overview of hotels: the row and column counts, then one line per
# column with its type and first few values
glimpse(hotels)
## Rows: 119,390
## Columns: 32
## $ hotel                          <chr> "Resort Hotel", "Resort Hotel", "Resort…
## $ is_canceled                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ lead_time                      <dbl> 342, 737, 7, 13, 14, 14, 0, 9, 85, 75, …
## $ arrival_date_year              <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 201…
## $ arrival_date_month             <chr> "July", "July", "July", "July", "July",…
## $ arrival_date_week_number       <dbl> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,…
## $ arrival_date_day_of_month      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ stays_in_weekend_nights        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ stays_in_week_nights           <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …
## $ adults                         <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ children                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ babies                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ meal                           <chr> "BB", "BB", "BB", "BB", "BB", "BB", "BB…
## $ country                        <chr> "PRT", "PRT", "GBR", "GBR", "GBR", "GBR…
## $ market_segment                 <chr> "Direct", "Direct", "Direct", "Corporat…
## $ distribution_channel           <chr> "Direct", "Direct", "Direct", "Corporat…
## $ is_repeated_guest              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_cancellations         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type             <chr> "C", "C", "A", "A", "A", "A", "C", "C",…
## $ assigned_room_type             <chr> "C", "C", "C", "A", "A", "A", "C", "C",…
## $ booking_changes                <dbl> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ deposit_type                   <chr> "No Deposit", "No Deposit", "No Deposit…
## $ agent                          <chr> "NULL", "NULL", "NULL", "304", "240", "…
## $ company                        <chr> "NULL", "NULL", "NULL", "NULL", "NULL",…
## $ days_in_waiting_list           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ customer_type                  <chr> "Transient", "Transient", "Transient", …
## $ adr                            <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98.00,…
## $ required_car_parking_spaces    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ total_of_special_requests      <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, …
## $ reservation_status             <chr> "Check-Out", "Check-Out", "Check-Out", …
## $ reservation_status_date        <date> 2015-07-01, 2015-07-01, 2015-07-02, 20…


B. Choosing rows or columns


5. Select a single column (Slide #24)

# Keep only the lead_time column; the result is still a data frame
select(hotels, lead_time)

6. Select multiple columns (Slide #25)

# Keep several columns at once; they appear in the order listed
select(
  hotels,
  lead_time,
  agent,
  market_segment
)

7. Arrange entries of a column (Slide #28)

# Sort the rows of hotels by lead_time, smallest first (arrange() default)
arrange(hotels, lead_time)

8. Arrange entries of a column in the descending order (Slide #30)

# Sort the rows of hotels by lead_time, largest first; desc() reverses the order
arrange(hotels, desc(lead_time))

9. Select columns and arrange the entries of a column (Slide #31)

# Nested calls are evaluated inside out: select() keeps lead_time first,
# then arrange() sorts that single column from largest to smallest
arrange(
  select(hotels, lead_time),
  desc(lead_time)
)

10. Select columns and arrange the entries of a column using the pipe operator (Slide #37)

# Keep only lead_time, then sort it from largest to smallest.
# The pipe (%>%) passes the value on its left as the first argument of the
# call on its right, so the steps read top to bottom instead of inside out.
hotels %>%
  select(lead_time) %>%
  arrange(desc(lead_time))

11. Pick rows matching a condition (Slide #44)

# Keep bookings with at least one child, then show only hotel and children
hotels %>%
  filter(children >= 1) %>%
  select(hotel, children)

12. Pick rows matching multiple conditions (Slide #46)

# Comma-separated conditions in filter() must all hold (logical AND):
# bookings at "City Hotel" that include at least one child
hotels %>%
  filter(
    children >= 1,
    hotel == "City Hotel"
  ) %>%
  select(hotel, children)

13. Non-conditional selection of rows: sequence of indices (Slide #49)

# Pick rows by position: rows 1 through 5
hotels %>%
  slice(1:5)

14. Non-conditional selection of rows: non-consecutive/specific indices (Slide #50)

# Pick specific, non-consecutive rows by position: rows 1, 3 and 5
hotels %>%
  slice(1, 3, 5)

15. Pick unique rows using distinct() (Slide #52)

# One row per distinct value of the hotel column
hotels %>%
  distinct(hotel)


C. Creating new columns


16. Creating a single column with mutate() (Slide #56)

# Add little_ones as the row-wise sum of children and babies, then display it
# next to the columns it was built from
hotels %>%
  mutate(little_ones = children + babies) %>%
  select(hotel, little_ones, children, babies)

17. Creating multiple columns with mutate() (Slide #58)

# Add two columns in a single mutate() call:
#   little_ones         - row-wise sum of children and babies
#   average_little_ones - mean of little_ones over all rows, repeated on
#                         every row
# A column created earlier in the same mutate() call can be used by the
# expressions that follow it.
# na.rm = TRUE keeps a missing children/babies value from turning the whole
# average into NA; the result is unchanged when nothing is missing.
hotels %>%
  mutate(
    little_ones = children + babies,
    average_little_ones = mean(little_ones, na.rm = TRUE)
  ) %>%
  select(hotel, little_ones, children, babies, average_little_ones)


D. More operations with examples


18. count() to get frequencies (Slide #60)

# Number of rows for each value of market_segment; the counts go in column n
hotels %>%
  count(market_segment)

19. count() to get frequencies with sorting of count (Slide #61)

# Frequency of each market_segment, with sort = TRUE placing the largest
# counts first
hotels %>%
  count(market_segment, sort = TRUE)

20. count() multiple variables (Slide #62)

# Number of rows for each combination of hotel and market_segment
hotels %>%
  count(hotel, market_segment)

21. summarise() for summary statistics (Slide #63)

# Collapse the whole data set to one row holding the mean of adr
hotels %>%
  summarise(mean_adr = mean(adr))

22. summarise() by using group_by to find mean (Slide #64)

# Mean of adr computed separately for each hotel: group_by() defines the
# groups and summarise() returns one row per group
hotels %>%
  group_by(hotel) %>%
  summarise(mean_adr = mean(adr))

23. summarise() by using group_by to get count (Slide #65)

# Number of rows per hotel: n() counts the rows in the current group
hotels %>%
  group_by(hotel) %>%
  summarise(count = n())

24. summarise() for multiple summary statistics (Slide #67)

# Several summary statistics of adr in one summarise() call; the result is a
# single row with one column per statistic
hotels %>%
  summarise(
    min_adr = min(adr),
    mean_adr = mean(adr),
    median_adr = median(adr),
    max_adr = max(adr)
  )

25. select(), slice() and arrange() (Slide #68)

# Order of steps matters: slice() runs before arrange(), so this takes the
# first five rows as they appear in the data and only then sorts those five
# by lead_time
hotels %>%
  select(hotel, lead_time) %>%
  slice(1:5) %>%
  arrange(lead_time)

26. select(), arrange() and slice() (Slide #69)

# Order of steps matters: arrange() runs before slice(), so this sorts every
# row by lead_time first and then keeps the five rows at the top of that
# sorted result
hotels %>%
  select(hotel, lead_time) %>%
  arrange(lead_time) %>%
  slice(1:5)

27. filter() to select rows based on conditions (Slide #73)

# Bookings with no adults but at least one child; both conditions must hold
hotels %>%
  filter(
    adults == 0,
    children >= 1
  ) %>%
  select(adults, babies, children)

28. filter() to select rows based on complicated conditions (Slide #74)

# Bookings with exactly one adult and at least one child or baby
hotels %>%
  filter(
    adults == 1,
    children >= 1 | babies >= 1 # | means OR
  ) %>%
  select(adults, babies, children)

29. count() and arrange() (Slide #76)

# Count rows per market_segment, then order by n, the column count() creates
hotels %>%
  count(market_segment) %>%
  arrange(desc(n)) # <-- decreasing order of counts

30. mutate(), select() and arrange() (Slide #77)

# Build little_ones, keep it with its source columns, and list the bookings
# with the most little ones first
hotels %>%
  mutate(little_ones = children + babies) %>%
  select(children, babies, little_ones) %>%
  arrange(desc(little_ones))

31. mutate(), filter() and select() (Slide #78)

# Build little_ones, then keep "Resort Hotel" bookings that have at least one
# little one and show just the hotel and the new column
hotels %>%
  mutate(little_ones = children + babies) %>%
  filter(
    little_ones >= 1,
    hotel == "Resort Hotel"
  ) %>%
  select(hotel, little_ones)